import pandas as pd
import numpy as np

# NOTE(review): `c` is only ever substituted into the input/output file names
# below; presumably it is the cluster count of the run -- confirm.
DEFAULT_C = 1000

# NOTE(review): the first argument column is spelled 'ARGO' (letter O), not
# 'ARG0' (digit zero), everywhere in this script. It is kept as-is because the
# CSV header is not visible from here -- confirm it matches the input file.
ARGUMENT_COLUMNS = ('ARGO', 'ARG1')


def _apply_manual_labels(df, labels, column):
    """Swap the values of `column` for their manual cluster labels.

    The original values are preserved in a column named ``<column>-freq`` and
    the manual label (``cluster_label_manual``) takes over the name `column`.
    Values without a match in `labels` end up as NaN (left merge).

    Parameters
    ----------
    df : pandas.DataFrame
        Narratives table containing `column`.
    labels : pandas.DataFrame
        Table with at least the columns ``ARG`` and ``cluster_label_manual``.
    column : str
        Name of the column in `df` to relabel.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    # Exact duplicate (ARG, label) rows would multiply the matching narratives
    # in the left merge and inflate the frequencies computed afterwards.
    lookup = labels[['ARG', 'cluster_label_manual']].drop_duplicates()
    merged = df.merge(lookup, left_on=column, right_on='ARG', how='left')
    merged = merged.rename(
        columns={column: column + '-freq', 'cluster_label_manual': column}
    )
    return merged.drop(columns=['ARG'])


def relabel_narratives(df, labels):
    """Rebuild narratives from manual cluster labels and recount them.

    Steps:
      1. Replace both argument columns by their manual labels.
      2. Drop rows whose label is ``'noise'`` for either argument, and rows
         whose narrative cannot be built (a missing label or missing verb
         makes the concatenated string NaN).
      3. Rebuild ``narrative`` as ``"<ARGO> <B-V-RAW> <ARG1>"``.
      4. Recompute ``frequency`` as the number of rows per narrative.
      5. Drop the narrative ``'company hold bank'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Narratives with the columns ``ARGO``, ``B-V-RAW`` and ``ARG1``.
    labels : pandas.DataFrame
        Manual labels with the columns ``ARG`` and ``cluster_label_manual``.

    Returns
    -------
    pandas.DataFrame
        The relabeled narratives, including the helper column ``n`` (always
        1), which is kept so the output schema stays unchanged.
    """
    for column in ARGUMENT_COLUMNS:
        df = _apply_manual_labels(df, labels, column)

    print('Narratives all cluster labels:', len(df))

    # Reconstruct narratives with manual labels; NaN in any part yields NaN.
    narrative = df['ARGO'] + ' ' + df['B-V-RAW'] + ' ' + df['ARG1']

    # Unlabeled rows are removed *before* counting so that `frequency` is
    # computed over the kept rows only and is not upcast to float by NaN keys.
    keep = (
        (df['ARGO'] != 'noise')
        & (df['ARG1'] != 'noise')
        & narrative.notna()
    )
    # Explicit copy: the columns added below must not be written to a view.
    result = df[keep].copy()
    result['narrative'] = narrative[keep]

    # Re-calculate frequency
    result['n'] = 1
    result['frequency'] = result.groupby('narrative')['n'].transform('sum')

    # Drop a frequent narrative that captures mostly noise
    return result[result['narrative'] != 'company hold bank']


def main(c=DEFAULT_C):
    """Read the narratives and manual labels for run `c`, relabel, and save."""
    df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_{0}.csv'.format(c))
    labels = pd.read_csv('../data/metadata/gpo_manual_cluster_labels/all_clusters_manually_labeled_{0}.csv'.format(c))

    df = relabel_narratives(df, labels)

    df.to_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_{0}.csv'.format(c), index=False)


if __name__ == "__main__":
    import sys

    # Optional first CLI argument overrides the default `c`.
    main(int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_C)
